#!/usr/bin/env python
# Assignment 5 answer template.
# Updated Sat Oct 27, 2012 at 8:48
# http://web.mit.edu/6.863/fall2012/writeups/assignment5/assignment5_template.py
#
# Fill in your answers here, in the variables that end with _ANSWER.
# The format of each desired answer is described in the variable that ends with _FORMAT. Don't deviate from this format.
#
# Before submitting, verify that you have filled out the questions and
# your submission follows the desired format by running this file:
# python assignment5_template.py

# ---- Answer section ----
# Each *_ANSWER value below must satisfy the spec held in its matching
# *_FORMAT variable; the checker at the bottom of this file verifies that.

# what is your name?
what_is_my_name_ANSWER = 'Peng Wang'
what_is_my_name_FORMAT = 'freeform text' # can be an arbitrary string, no format restrictions

# what are the names of your collaborators, if any?
who_are_my_collaborators_ANSWER = ['Ravi Netravali']
who_are_my_collaborators_FORMAT = 'list of strings' # ie, ['John Doe', 'Jane Doe']

# === question 1: stack-based parsing ===

# parse tree for 'my dog saw a man in the park with a statue' generated by making manual shift-reduce decisions
parse_tree_generated_ANSWER = '''
(S (NP (Det my) (N dog)) 
   (VP (VP (VP (V saw) 
               (NP (Det a) (N man))) 
           (PP (P in) 
               (NP (Det the) (N park)))) 
       (PP (P with) (NP (Det a) (N statue)))))
'''
parse_tree_generated_FORMAT = 'tree' # a parse tree, of the standard '(S (NP (Det the) (N dog) ) (VP (V ate) ) )' format. Don't worry about spaces, they'll be removed before verification

# how would you characterize the strategy that the parser is automatically using? Is it 'shift > reduce' or 'reduce > shift'?
strategy_used_by_parser_ANSWER = 'reduce > shift'
strategy_used_by_parser_FORMAT = 'parse strategy' # 'reduce > shift' or 'shift > reduce'

# which strategy leads to bushy parse trees?
strategy_leading_to_bushy_parse_trees_ANSWER = 'reduce > shift'
strategy_leading_to_bushy_parse_trees_FORMAT = 'parse strategy'

# which strategy leads to straggly parse trees?
strategy_leading_to_straggly_parse_trees_ANSWER = 'shift > reduce'
strategy_leading_to_straggly_parse_trees_FORMAT = 'parse strategy'

# give a complete parse tree for the sentence 'my dog saw a man in the park with a statue' that is as bushy as possible, and one that is as straggly as possible
most_bushy_parse_tree_ANSWER = '''
(S (NP (Det my) (N dog)) 
   (VP (VP (VP (V saw) 
               (NP (Det a) (N man))) 
           (PP (P in) 
               (NP (Det the) (N park)))) 
       (PP (P with) (NP (Det a) (N statue)))))
'''
most_bushy_parse_tree_FORMAT = 'tree'

most_straggly_parse_tree_ANSWER = '''
(S (NP (Det my) (N dog))
   (VP (V saw)
       (NP (NP (Det a) (N man))
           (PP (P in)
               (NP (NP (Det the) (N park))
                   (PP (P with)
                       (NP (Det a) (N statue))))))))
'''
most_straggly_parse_tree_FORMAT = 'tree'

# do people's preference for 'my dog saw a man in the park with a statue on the grass behind the fence' follow the 'shift > reduce' or 'reduce > shift' strategy?
peoples_preference_for_this_sentence_ANSWER = 'shift > reduce'
peoples_preference_for_this_sentence_FORMAT = 'parse strategy'

# explain this in terms of what they put on the stack
peoples_preference_explanation_ANSWER = '''
My parse is that "the man is in the park",  "the park is with a statue", "the man is on the grassa", and "the grass is behind the fence". For all successful parses there are only 4 decision points, namely before in, with, on and behind. For three of them, I choose shift; for the other one, I choose reduce. In this sense it is shift > reduce.
'''
peoples_preference_explanation_FORMAT = 'freeform text'

# Is it possible to write a standard LR parser without lookahead that will obtain the desired parses for both of these sentences, using our toy grammar? 
does_lr_parser_without_lookahead_work_ANSWER = False
does_lr_parser_without_lookahead_work_FORMAT = 'boolean' # True or False

# How about an LALR(1) parser?
does_lalr1_parser_work_ANSWER = True
does_lalr1_parser_work_FORMAT = 'boolean'

# How about an LALR(2) parser?
does_lalr2_parser_work_ANSWER = True
does_lalr2_parser_work_FORMAT = 'boolean'

# === question 2: scaling up using larger grammars ===

# what is the size of the wsj.cfg grammar
grammar_size_ANSWER = 26413
grammar_size_FORMAT = 'integer' # 0, 1, etc

# how many parse trees are there for the first, simple sentence: 'John is happy'?
num_parse_trees_for_sentence_ANSWER = 33955
num_parse_trees_for_sentence_FORMAT = 'integer'

# are there any of the first 100 trees that correspond to what the 'correct' parse for 'John is happy' ought to be?
is_correct_parse_tree_present_ANSWER = False
is_correct_parse_tree_present_FORMAT = 'boolean'

# if your answer to the previous question was True, what is the [zero-based] index of the correct parse in the list of parses? [If it isn't among the first 100 parse trees, just enter -1]
index_of_correct_parse_tree_ANSWER = -1
index_of_correct_parse_tree_FORMAT = 'integer'

# === question 3: probabilistic context-free parsing

# what are the rules for expanding the nonterminal 'VP' and their associated probabilities?
rules_for_expanding_VP_ANSWER = {'V1 SBAR' : 1.0/3, 'V2' : 1.0/3, 'VP ADVP' : 1.0/3}
rules_for_expanding_VP_FORMAT = 'dictionary of rule probabilities'  # dictionary mapping elements rule expands to -> probability. ex: {'V PP': 0.1, 'V': 0.9} if the grammar is VP -> V PP [0.1]; VP -> V [0.9]. Order of elements in the dictionary doesn't matter.

# find the two most-likely parses for 'Jeff pronounced that Fred snored loudly', and their probabilities
most_likely_parse_ANSWER = '''
(S
  (NP Jeff)
  (VP
    (V1 pronounced)
    (SBAR
      (COMP that)
      (S (NP Fred) (VP (VP (V2 snored)) (ADVP loudly))))))

'''
most_likely_parse_FORMAT = 'tree'

most_likely_parse_probability_ANSWER = 3.8103947569e-05
most_likely_parse_probability_FORMAT = 'probability'

second_most_likely_parse_ANSWER = '''
(S
  (NP Jeff)
  (VP
    (VP
      (V1 pronounced)
      (SBAR (COMP that) (S (NP Fred) (VP (V2 snored)))))
    (ADVP loudly)))
'''
second_most_likely_parse_FORMAT = 'tree'

second_most_likely_parse_probability_ANSWER = 3.8103947569e-05
second_most_likely_parse_probability_FORMAT = 'probability'

# getting rid of ambiguity by small changes to the corpus. What change did you make?
change_made_to_remove_ambiguity_ANSWER = 'I changed the lower S to a new nonterminal Slow, and the VP above the ADVP to a new nonterminal VPlow'
change_made_to_remove_ambiguity_FORMAT = 'freeform text'

# What is the probability of the correct parse tree now?
correct_parse_tree_probability_after_change_ANSWER = 0.000257201646091
correct_parse_tree_probability_after_change_FORMAT = 'probability'

# train a probabilistic grammar from the 10% wsj sample. How many rules does the grammar have?
number_of_rules_in_grammar_ANSWER = 27783
number_of_rules_in_grammar_FORMAT = 'integer'

# retrain your grammar using markov order-1 smoothing. How many rules does the grammar now have?
number_of_rules_in_retrained_grammar_ANSWER = 21845
number_of_rules_in_retrained_grammar_FORMAT = 'integer'

# use your smoothed grammar to parse these sentences. Report the number of constituents in the reference parse (as given in figure 1 of the handout), the number of constituents in the parse by the grammar you trained, and the number of constituents in the intersection (number of shared constituents).

# The luxury auto maker last year sold 1,000 cars in the U.S.
sentence1_num_constituents_ground_truth_ANSWER = 19
sentence1_num_constituents_ground_truth_FORMAT = 'integer'

sentence1_num_constituents_parsed_by_your_grammar_ANSWER = 19
sentence1_num_constituents_parsed_by_your_grammar_FORMAT = 'integer'

sentence1_num_constituents_shared_ANSWER = 15
sentence1_num_constituents_shared_FORMAT = 'integer'

# Bell Industries Inc. increased its quarterly to 10 cents from seven cents a share .
sentence2_num_constituents_ground_truth_ANSWER = 25
sentence2_num_constituents_ground_truth_FORMAT = 'integer'

sentence2_num_constituents_parsed_by_your_grammar_ANSWER = 26
sentence2_num_constituents_parsed_by_your_grammar_FORMAT = 'integer'

sentence2_num_constituents_shared_ANSWER = 19
sentence2_num_constituents_shared_FORMAT = 'integer'

# The dispute shows clearly the global power of Japan 's financial titans .
sentence3_num_constituents_ground_truth_ANSWER = 22
sentence3_num_constituents_ground_truth_FORMAT = 'integer'

sentence3_num_constituents_parsed_by_your_grammar_ANSWER = 21
sentence3_num_constituents_parsed_by_your_grammar_FORMAT = 'integer'

sentence3_num_constituents_shared_ANSWER = 20
sentence3_num_constituents_shared_FORMAT = 'integer'

# Was this why some of the audience departed before or during the second half ?
sentence4_num_constituents_ground_truth_ANSWER = 27
sentence4_num_constituents_ground_truth_FORMAT = 'integer'

# why can't the current grammar parse the fourth sentence? you should observe at least two problems and suggest fixes for each
why_cant_current_grammar_parse_sentence4_ANSWER = '''
Problem 1: PP-TMP does not have a rule : PP-TMP -> IN CC IN NP
Problem 2: SQ does not have a rule : SQ -> VBD NP-SBJ SBAR-PRD
Solution: the best_parse parser should not require that every span of the input have a parse. It can make a fake_parse mark there and assign to it a very low score to ensure that it will not be used in upper parses. Also add the missing rules to the gramma.
'''
why_cant_current_grammar_parse_sentence4_FORMAT = 'freeform text'

# === END OF YOUR SUBMISSION, DON'T MODIFY THIS LINE OR ANYTHING BELOW IT ===

# The following code does the verification that your submission is correctly formatted and that you have filled out all the questions.

def have_format_error(answer, fmt):
  """Validate *answer* against the declared format string *fmt*.

  Returns a human-readable message describing the first violation found,
  or None when the answer conforms to the format. Unknown format names
  (i.e. a modified _FORMAT variable) also produce an error message.
  """
  if fmt == 'integer':
    if type(answer) is not int:
      return 'Must be an integer, ex: 1'
    return None
  if fmt == 'boolean':
    # Membership test uses ==, so numeric 1/0 also count as True/False.
    if answer not in (True, False):
      return 'Must be a boolean, ex: True'
    return None
  if fmt == 'probability':
    if type(answer) is not float:
      return 'Must be a float, ex: 0.0'
    if not 0.0 <= answer <= 1.0:
      return 'Probability must be between 0.0 and 1.0, ex: 0.5'
    return None
  if fmt == 'list of strings':
    if type(answer) is not list:
      return "Must be a list, ex: ['John Doe']"
    if any(type(entry) is not str for entry in answer):
      return "Elements in the list must be strings, ex: ['John Doe']"
    return None
  if fmt == 'dictionary of rule probabilities':
    if type(answer) is not dict:
      return "Must be a dictionary, ex: {'V PP': 0.1, 'V': 0.9}"
    if any(type(key) is not str for key in answer.keys()):
      return "Keys in the dictionary must be strings, ex: {'V PP': 0.1, 'V': 0.9}"
    if any(type(prob) is not float for prob in answer.values()):
      return "Values in the dictionary must be floating point numbers, ex: {'V PP': 0.1, 'V': 0.9}"
    if not all(0.0 <= prob <= 1.0 for prob in answer.values()):
      return "Values in the dictionary must each be between 0.0 and 1.0, ex: {'V PP': 0.1, 'V': 0.9}"
    # Small tolerance around 1.0 forgives floating-point rounding.
    if not 0.99 <= sum(answer.values()) <= 1.01:
      return "Values in the dictionary must sum to 1.0, ex: {'V PP': 0.1, 'V': 0.9}"
    return None
  if fmt in ('freeform text', 'tree', 'parse strategy'):
    # All three string-valued formats share the type check and stripping.
    if type(answer) is not str:
      return "Must be a string"
    answer = answer.strip()
    if fmt == 'parse strategy':
      if answer in ('shift > reduce', 'reduce > shift'):
        return None
      return "Parse strategy must be either 'shift > reduce' or 'reduce > shift'"
    if fmt == 'freeform text':
      if answer:
        return None
      return 'You need to write something'
    # fmt == 'tree': cheap structural checks first, then bracket matching.
    if '(' not in answer:
      return "Parse tree needs to have at least 1 '('"
    if ')' not in answer:
      return "Parse tree needs to have at least 1 ')'"
    if not answer.startswith('('):
      return "Parse tree needs to start with '('"
    if not answer.endswith(')'):
      return "Parse tree needs to end with ')'"
    if answer.count('(') != answer.count(')'):
      return "Parse tree need to have an equal number of '(' and ')'"
    # The outermost '(' must be closed exactly at the last character,
    # otherwise the answer holds several trees or closes too early.
    if index_where_parse_tree_is_closed(answer) != len(answer) - 1:
      return 'Answer should be a single complete parse tree; you either have multiple trees, or your tree was prematurely closed'
    return None
  return "The declared answer format '" + fmt + "' is not recognized; you changed _FORMAT for this question"

def index_where_parse_tree_is_closed(parse_tree):
  """Return the index of the ')' that closes the leading '(' of *parse_tree*.

  Returns -1 when the string does not begin with '(' or the opening
  bracket is never balanced by a matching close.
  """
  if not parse_tree or not parse_tree.startswith('('):
    return -1
  depth = 1  # one unmatched '(' so far: the leading bracket
  for pos, ch in enumerate(parse_tree[1:], start=1):
    if ch == '(':
      depth += 1
    elif ch == ')':
      depth -= 1
      if depth == 0:
        return pos
  return -1

if __name__ == '__main__':
  # Self-check driver: discover every *_ANSWER variable defined above,
  # flag unanswered questions, and verify each answer against its format.
  # print(...) with a single argument is valid in both Python 2 and 3
  # (the original `print 'x'` statements were a SyntaxError under
  # Python 3, breaking the documented `python assignment5_template.py`
  # invocation). list() is needed because Python 3 returns a dict view.
  question_names = list(locals().keys())
  question_names = [x for x in question_names if '__' not in x]
  question_names = [x for x in question_names if '_ANSWER' in x]
  question_names = [x.replace('_ANSWER', '') for x in question_names]
  question_formats = {}
  for x in question_names:
    question_formats[x] = locals()[x + '_FORMAT']
  question_answers = {}
  for x in question_names:
    question_answers[x] = locals()[x + '_ANSWER']
  unanswered_questions = set()
  # sorted() gives a deterministic report order across Python versions.
  for question, answer in sorted(question_answers.items()):
    if answer == 'FILLIN':
      print('Need to answer question: ' + question)
      unanswered_questions.add(question)
  for question, fmt in sorted(question_formats.items()):
    if question in unanswered_questions:
      continue  # already reported as missing; skip the format check
    format_error = have_format_error(question_answers[question], fmt)
    if format_error:
      print('Not in correct format: ' + question + ': ' + format_error)

